# import libraries
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
from matplotlib.pyplot import figure
import seaborn as sns
sns.set(rc={'figure.figsize':(10, 8)})
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats as st
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# function for primary analysis of the dataset
def dataset(dataset):
    """Show a first-look summary of *dataset*.

    Prints .info(), .describe(), a random 10-row sample, and the number of
    fully duplicated rows, separated by '*' rules.

    Side effect: normalizes column labels in place (lowercase, spaces -> underscores).
    """
    dataset.columns = [x.lower().replace(' ', '_') for x in dataset.columns.values]
    # .info() prints its report directly; wrapping it in display() also showed
    # its None return value in the output, so call it bare
    dataset.info()
    print('*' * 50)
    display(dataset.describe())
    print('*' * 50)
    display(dataset.sample(10))
    print('*' * 50)
    display('Number of duplicate values', dataset.duplicated().sum())
# load the raw tables: game event log, daily ad spend, and user acquisition sources
visits = pd.read_csv('/datasets/game_actions.csv')
costs = pd.read_csv('/datasets/ad_costs.csv')
sources = pd.read_csv('/datasets/user_source.csv')
# first look at the event log
dataset(visits)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 135640 entries, 0 to 135639 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 event_datetime 135640 non-null object 1 event 135640 non-null object 2 building_type 127957 non-null object 3 user_id 135640 non-null object 4 project_type 1866 non-null object dtypes: object(5) memory usage: 5.2+ MB
None
**************************************************
| event_datetime | event | building_type | user_id | project_type | |
|---|---|---|---|---|---|
| count | 135640 | 135640 | 127957 | 135640 | 1866 |
| unique | 128790 | 3 | 3 | 13576 | 1 |
| top | 2020-05-07 03:10:43 | building | spaceport | ff841992-d93b-462e-bc6e-18c24a514f36 | satellite_orbital_assembly |
| freq | 4 | 127957 | 59325 | 22 | 1866 |
**************************************************
| event_datetime | event | building_type | user_id | project_type | |
|---|---|---|---|---|---|
| 103924 | 2020-05-14 06:06:48 | building | spaceport | 1e9dd017-2faa-496c-b996-a6c57fb3942c | NaN |
| 109152 | 2020-05-15 00:40:18 | building | spaceport | d83fa13f-ce84-4e1b-b079-5ed2b6ec7897 | NaN |
| 78579 | 2020-05-11 00:31:23 | building | spaceport | 1ae5c3be-0463-4ad2-93a6-1169c58f0aee | NaN |
| 50424 | 2020-05-09 00:43:26 | building | assembly_shop | 3546278c-c23c-4466-a901-7beabad73821 | NaN |
| 73108 | 2020-05-10 16:02:55 | building | spaceport | 5520f40c-925b-40ba-8fe6-e5c69f4da82a | NaN |
| 121952 | 2020-05-17 10:30:53 | building | spaceport | da3db52d-3401-47b7-b879-bbd40dac991b | NaN |
| 31 | 2020-05-04 00:06:16 | building | assembly_shop | 5c061bf9-92bb-4ec5-b01a-e648dc10a9c1 | NaN |
| 37968 | 2020-05-07 23:55:39 | building | spaceport | ec487be0-3318-44c2-a8b6-ea329bd077d8 | NaN |
| 71208 | 2020-05-10 13:04:19 | building | assembly_shop | e838492c-543e-431a-807b-e9204252af88 | NaN |
| 117713 | 2020-05-16 12:30:39 | building | research_center | 198d4bdc-6cab-458d-9164-c2fcab13d4da | NaN |
**************************************************
'Number of duplicate values'
1
The logs contain information about 13,576 players. Only 1 duplicate, can be neglected.
visits = visits.drop_duplicates()  # 1 full duplicate was found above
visits['event_datetime'] = pd.to_datetime(visits['event_datetime']) # we bring the data to the datetime format
# project_type is filled only on completed-project events
visits['project_type'].unique()
array([nan, 'satellite_orbital_assembly'], dtype=object)
# events carrying the completed-project label
visits.query('project_type=="satellite_orbital_assembly"').count()
event_datetime 1866 event 1866 building_type 0 user_id 1866 project_type 1866 dtype: int64
# events of type "project" — expected to match the count above
visits.query('event=="project"').count()
event_datetime 1866 event 1866 building_type 0 user_id 1866 project_type 1866 dtype: int64
The number of events with the type of implemented project "development of orbital assembly of satellites" coincides with the number of events containing information about the completion of the project. According to the team's comments, the omissions in the "project_type" column are due to the mechanics of data collection.
# first look at the ad-costs table
dataset(costs)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 28 entries, 0 to 27 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 source 28 non-null object 1 day 28 non-null object 2 cost 28 non-null float64 dtypes: float64(1), object(2) memory usage: 800.0+ bytes
None
**************************************************
| cost | |
|---|---|
| count | 28.000000 |
| mean | 271.556321 |
| std | 286.867650 |
| min | 23.314669 |
| 25% | 66.747365 |
| 50% | 160.056443 |
| 75% | 349.034473 |
| max | 969.139394 |
**************************************************
| source | day | cost | |
|---|---|---|---|
| 26 | youtube_channel_reklama | 2020-05-08 | 40.217907 |
| 22 | youtube_channel_reklama | 2020-05-04 | 259.073224 |
| 10 | instagram_new_adverts | 2020-05-06 | 173.071145 |
| 1 | facebook_ads | 2020-05-04 | 548.354480 |
| 8 | instagram_new_adverts | 2020-05-04 | 502.925451 |
| 21 | youtube_channel_reklama | 2020-05-03 | 454.224943 |
| 2 | facebook_ads | 2020-05-05 | 260.185754 |
| 14 | yandex_direct | 2020-05-03 | 969.139394 |
| 13 | instagram_new_adverts | 2020-05-09 | 46.775400 |
| 17 | yandex_direct | 2020-05-06 | 180.917099 |
**************************************************
'Number of duplicate values'
0
costs['day'] = pd.to_datetime(costs['day']) # we bring the data to the datetime format
# which advertising channels are present
costs['source'].unique()
array(['facebook_ads', 'instagram_new_adverts', 'yandex_direct',
'youtube_channel_reklama'], dtype=object)
# date range covered by the advertising campaign
display(costs['day'].max(), costs['day'].min())
Timestamp('2020-05-09 00:00:00')
Timestamp('2020-05-03 00:00:00')
In total, 4 advertising channels were used; the date range matches the declared cohort data, with a one-day lag between a user's acquisition and their first use of the application.
# first look at the user-source table
dataset(sources)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13576 entries, 0 to 13575 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 13576 non-null object 1 source 13576 non-null object dtypes: object(2) memory usage: 212.2+ KB
None
**************************************************
| user_id | source | |
|---|---|---|
| count | 13576 | 13576 |
| unique | 13576 | 4 |
| top | 483e6bea-1db0-4d45-91b3-8f01b5d95c93 | yandex_direct |
| freq | 1 | 4817 |
**************************************************
| user_id | source | |
|---|---|---|
| 755 | 0de7acab-12c9-4498-b48a-ccd746915fc7 | facebook_ads |
| 971 | 11a91a61-405e-49c7-8911-da5d935b388c | youtube_channel_reklama |
| 857 | 0fba7003-2f85-4d75-a98a-cf4dd199e706 | facebook_ads |
| 2100 | 260a051d-765e-4ddb-b9fb-b04b2cbc515d | yandex_direct |
| 12133 | e4a081f1-8f41-4d29-bb02-20a5192488e8 | youtube_channel_reklama |
| 3719 | 448db445-6f82-49f2-87ee-6efcc7d604da | yandex_direct |
| 4412 | 517c8522-8e42-4ab2-9f77-c92785de9f1a | youtube_channel_reklama |
| 13292 | facade8f-2b33-46c2-9d49-9d7c3e4aecba | youtube_channel_reklama |
| 732 | 0d72e1bb-2d82-46c2-830e-8326345994cb | facebook_ads |
| 2208 | 27effca9-efae-43de-910f-2c1e56c9bb7a | yandex_direct |
**************************************************
'Number of duplicate values'
0
# total number of distinct players in the cohort
players_number = sources['user_id'].nunique()
players_number
13576
All user IDs are unique.
We have prepared the data for analysis by bringing it to the desired format and checking for omissions/duplicates.
# overall activity: how many events the average player generates
events = visits.shape[0]
users = visits['user_id'].nunique()
average_events_per_user = events / users
display('Total number of events', events)
display('Total number of users', users)
display('Average number of events per user', '{:.2f}'.format(average_events_per_user))
'Total number of events'
135639
'Total number of users'
13576
'Average number of events per user'
'9.99'
The number of unique user IDs in the session data coincides with the number of IDs in the data on the attraction channels.
# distribution of all events over the observation period
visits['event_datetime'].hist(figsize = (13,5), bins=50)
plt.title('Histogram by date and time')
plt.ylabel('Number of observations')
plt.show()
The graph shows how the number of users increased while the advertising campaign was going on until May 10, after that there is a sharp jump down in the number of visits (apparently, some users leave after the first use), then users already flow away gradually.
# same distribution, restricted to level-completion events
visits_finished_stage_1 = visits.query('event=="finished_stage_1"')
visits_finished_stage_1['event_datetime'].hist(figsize = (15,7), bins=50)
plt.title('Histogram of level completion events by date and time')
plt.ylabel('Number of observations')
plt.show()
The distribution of the number of observations about the completion of the first level is close to normal. The majority of users completed the first level between May 13 and May 21 - it takes users several days to complete the level.
users
13576
# share of active users by weekday, ordered Monday..Sunday
visits['day_week'] = pd.DatetimeIndex(visits['event_datetime']).day_name()
temp = visits.groupby('day_week')['user_id'].nunique().reset_index()
temp['share'] = temp['user_id']/len(visits['user_id'].unique())*100
week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# categorical ordering so the bars follow the calendar, not the alphabet
temp['day_week'] = pd.Categorical(temp['day_week'], categories=week, ordered=True)
temp = temp.sort_values('day_week')
fig = px.bar(temp, x='day_week', y='share')
fig.update_layout(barmode='stack',
title_text='Distribution of active users by day of the week, %',
yaxis=dict(
title='% from all users',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
On every day of the week, users are approximately equally active.
# number of events per building type (descending)
visits_building_type = visits.groupby('building_type')['user_id'].count().reset_index()
fig = px.bar(visits_building_type, x='building_type', y='user_id')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Number of events for the selected building type',
yaxis=dict(
title='Number of buildings',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
# number of events per event type (descending)
visits_event = visits.groupby('event')['user_id'].count().reset_index()
fig = px.bar(visits_event, x='event', y='user_id')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Distribution of observations by event type',
yaxis=dict(
title='Number of players',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
The most frequent type of event in the data is the construction of a building.
# record in a separate dataframe all the events about the players who completed the first level
completed = visits.query('event == "finished_stage_1"')
# one finished_stage_1 event per player, so len() counts players
display('The number of players who completed the first level', len(completed))
'The number of players who completed the first level'
5817
# conversion of the whole cohort into level completion
share = len(completed['user_id']) / players_number
display('The percentage of players who completed the first level, total', '{:.2%}'.format(share))
'The percentage of players who completed the first level, total'
'42.85%'
# each player has at most one completed-project event, so len() counts players
share_project = len(visits.query('project_type == "satellite_orbital_assembly"'))/ players_number
display('Percentage of players who completed the first level through the implementation of the project (from all players)',
'{:.2%}'.format(share_project))
'Percentage of players who completed the first level through the implementation of the project (from all players)'
'13.74%'
# the remainder completed the level by defeating another player
share_victory = share - share_project
display('Percentage of players who completed the first level by defeating another player (from all players)',
'{:.2%}'.format(share_victory))
'Percentage of players who completed the first level by defeating another player (from all players)'
'29.10%'
At the moment, the creators of the game plan to show ads only on the screen with a choice of buildings: but most of the users complete the game by defeating another player. It is possible that some of the users who were dragged out by the game and moved to the second level will not see ads at all.
Let's see if we have players who were able to move to the second level without having a single building in the logs.
#save the IDs of the users who completed the project
project = visits.query('event == "project"')
# a set makes the per-row membership test in define_completion_type O(1);
# the original list made each lookup O(n)
users_project = set(project['user_id'])
#function for categorizing the method of passing the first level by the user
def define_completion_type(user_id):
    """Return "project" if the user finished level 1 via the project, else "victory".

    Relies on the module-level `users_project` collection of project-completer IDs.
    """
    # parameter renamed from `id` to avoid shadowing the builtin
    if user_id in users_project:
        return "project"
    return "victory"
#adding a parameter
# vectorized isin() replaces the per-row apply() — identical labels, far faster
completed['type_of_completion'] = np.where(
    completed['user_id'].isin(users_project), "project", "victory"
)
#let's count the number of "building" events for each user
building_count = visits.query(
    'event == "building"').groupby('user_id').agg({'event':'count'}).reset_index().rename(
    columns={'event':'building_count'}
)
# left-join from `completed` so players with zero building events are kept:
# an inner join would silently drop them and make the zero-buildings check
# below impossible to fail by construction
building_count = pd.merge(completed, building_count, how="left", on='user_id')
building_count['building_count'] = building_count['building_count'].fillna(0)
building_count.head(5)
| user_id | building_count | event_datetime | event | building_type | project_type | day_week | type_of_completion | |
|---|---|---|---|---|---|---|---|---|
| 0 | 001d39dc-366c-4021-9604-6a3b9ff01e25 | 8 | 2020-05-12 07:40:47 | finished_stage_1 | NaN | NaN | Tuesday | victory |
| 1 | 003ee7e8-6356-4286-a715-62f22a18d2a1 | 8 | 2020-05-17 21:47:49 | finished_stage_1 | NaN | NaN | Sunday | victory |
| 2 | 004074f8-251a-4776-922c-8348dca1a2f6 | 12 | 2020-05-16 19:57:40 | finished_stage_1 | NaN | NaN | Saturday | victory |
| 3 | 00464e5e-28ed-4be6-bf44-517ef7b3937b | 8 | 2020-05-15 16:02:01 | finished_stage_1 | NaN | NaN | Friday | victory |
| 4 | 0056c007-c082-43cc-bdef-afc49bca4cb9 | 3 | 2020-05-16 13:48:39 | finished_stage_1 | NaN | NaN | Saturday | victory |
#let's count the number of users who have moved to the next level without having a single building in the logs
# NOTE(review): building_count was produced by an inner merge, so completed users
# with no building events at all are already absent from it — with that join type
# this check returns 0 by construction; verify the join before trusting the result
temp = building_count.query('event=="finished_stage_1" & building_count==0')
print(len(temp))
0
There are no players in the cohort who complete the level without trying to build a single building, therefore, showing ads on the screen with the choice of the building object is a working idea. Let's see which buildings are chosen for construction by the players who have completed the level.
#save in a separate table the users who completed the level by building the object
visits_project = visits.query('event=="project"')
visits_project = visits_project['user_id'].to_frame()
#save information with user IDs and building type in a separate table
visits_building = visits.query('event=="building"')
visits_building = visits_building[['building_type', 'user_id']]
# keep one row per (building_type, user) pair
visits_building = visits_building.drop_duplicates()
visits_building
| building_type | user_id | |
|---|---|---|
| 0 | assembly_shop | 55e92310-cb8e-4754-b622-597e124b03de |
| 1 | assembly_shop | c07b1c10-f477-44dc-81dc-ec82254b1347 |
| 2 | assembly_shop | 6edd42cc-e753-4ff6-a947-2107cd560710 |
| 3 | assembly_shop | 92c69003-d60a-444a-827f-8cc51bf6bf4c |
| 4 | assembly_shop | cdc6bb92-0ccb-4490-9866-ef142f09139d |
| ... | ... | ... |
| 135604 | research_center | 70e18c96-221c-4ccc-8b5e-45adc4b7e1a2 |
| 135615 | research_center | a4f54b34-299f-495f-b29f-745465741366 |
| 135634 | research_center | 515c1952-99aa-4bca-a7ea-d0449eb5385a |
| 135635 | research_center | f21d179f-1c4b-437e-b9c6-ab1976907195 |
| 135637 | research_center | ed3e7d02-8a96-4be7-9998-e9813ff9c316 |
34478 rows × 2 columns
#supplement the table about the players who completed the level with the selected type of building
# inner join keeps only the building rows of project-completers
visits_building_nextlevel = pd.merge(visits_project, visits_building, how="inner", on='user_id')
visits_building_nextlevel = visits_building_nextlevel.drop_duplicates()
visits_building_nextlevel.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5598 entries, 0 to 5597 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 5598 non-null object 1 building_type 5598 non-null object dtypes: object(2) memory usage: 131.2+ KB
# how many project-completers built each building type
visits_building_nextlevel = visits_building_nextlevel.groupby('building_type')['user_id'].count().reset_index()
fig = px.bar(visits_building_nextlevel, x='building_type', y='user_id')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Distribution of players who have completed the project by task type',
yaxis=dict(
title='Number of buildings',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
Apparently, in order to complete the project, the player needs to build all 3 types of buildings.
We will find for each user the date of the first interaction with the game, as well as the date of the last event.
# first and last event per user, plus the span between them
time = (
    visits.groupby('user_id', as_index=False)['event_datetime']
    .min()
    .rename(columns={'event_datetime': 'first_ts'})
)
time_last = (
    visits.groupby('user_id', as_index=False)['event_datetime']
    .max()
    .rename(columns={'event_datetime': 'last_ts'})
)
# one row per user: first event, last event, usage duration
time = time.merge(time_last, on='user_id', how='inner')
time['usage_duration'] = time['last_ts'] - time['first_ts']
display('Average time between events for 1 user', time['usage_duration'].mean())
display('Median time between events for 1 user', time['usage_duration'].median())
display('Maximum time between events for 1 user', time['usage_duration'].max())
display('Minimum time between events for 1 user', time['usage_duration'].min())
'Average time between events for 1 user'
Timedelta('10 days 16:13:07.579183853')
'Median time between events for 1 user'
Timedelta('10 days 12:30:52.500000')
'Maximum time between events for 1 user'
Timedelta('30 days 21:58:53')
'Minimum time between events for 1 user'
Timedelta('0 days 00:00:00')
On average, the difference between events in the cohort is 10 days, and is only a few hours different from the median. At the same time, there are players who have used the game for 30 days, and there are also those for whom there is only 1 event. Let's look at the example of 1 user, is there a difference between how the time of the "object is built" event is set and the event about the fact of passing the first level.
display(visits.sort_values(by='user_id',ascending=False).head(10))
| event_datetime | event | building_type | user_id | project_type | day_week | |
|---|---|---|---|---|---|---|
| 98034 | 2020-05-13 10:59:39 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Wednesday |
| 134719 | 2020-05-25 14:05:51 | finished_stage_1 | NaN | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
| 134614 | 2020-05-25 07:09:53 | building | research_center | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
| 117542 | 2020-05-16 11:39:58 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 41421 | 2020-05-08 06:57:30 | building | assembly_shop | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Friday |
| 115943 | 2020-05-16 04:29:30 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 111199 | 2020-05-15 08:36:25 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Friday |
| 80849 | 2020-05-11 07:09:49 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
| 134561 | 2020-05-25 03:55:06 | project | NaN | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | satellite_orbital_assembly | Monday |
| 53164 | 2020-05-09 06:00:38 | building | assembly_shop | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
#let's take the first user as an example
visits.query('user_id=="ffff69cc-fec1-4fd3-9f98-93be1112a6b8"')
| event_datetime | event | building_type | user_id | project_type | day_week | |
|---|---|---|---|---|---|---|
| 41421 | 2020-05-08 06:57:30 | building | assembly_shop | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Friday |
| 53164 | 2020-05-09 06:00:38 | building | assembly_shop | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 61421 | 2020-05-09 20:56:03 | building | assembly_shop | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 80849 | 2020-05-11 07:09:49 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
| 98034 | 2020-05-13 10:59:39 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Wednesday |
| 111199 | 2020-05-15 08:36:25 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Friday |
| 115943 | 2020-05-16 04:29:30 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 117542 | 2020-05-16 11:39:58 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 123797 | 2020-05-17 22:48:25 | building | research_center | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Sunday |
| 133483 | 2020-05-23 06:00:10 | building | spaceport | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Saturday |
| 134257 | 2020-05-24 12:52:53 | building | research_center | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Sunday |
| 134561 | 2020-05-25 03:55:06 | project | NaN | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | satellite_orbital_assembly | Monday |
| 134614 | 2020-05-25 07:09:53 | building | research_center | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
| 134719 | 2020-05-25 14:05:51 | finished_stage_1 | NaN | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | NaN | Monday |
The user selected all three types of objects for construction several times, while about 10 hours passed between the events "Project completed" and "First level passed". For further analysis, the moment of passing the first level will be considered the event "finished_stage_1".
#let's add data about the time spent in the game, the first and last event to the dataframe
completed = pd.merge(completed, time, how="inner", on='user_id')
#adding a column about the duration of the passage in hours
# timedelta -> float hours, easier for stats and plotting
completed['usage_duration_hours'] = completed['usage_duration'].dt.total_seconds()/3600
completed['usage_duration_hours'].describe()
count 5817.000000 mean 284.633212 std 97.625884 min 9.650278 25% 221.222500 50% 277.478889 75% 341.254444 max 741.981389 Name: usage_duration_hours, dtype: float64
The average time to complete the level is quite variable, the spread is approximately from 9.5 hours to 742. The average passage time is 285 hours, i.e. almost 12 days. Let's look at the distributions for the two ways of passing the level.
# overlaid histograms of completion time for the two strategies
fig = plt.figure(figsize=(15,7))
# NOTE(review): legend labels are in Russian ("Победа над игроком" = victory over
# a player, "Завершение проекта" = project completion) — consider translating
completed.query('type_of_completion == "victory"')['usage_duration_hours'].hist(density=True, bins=30, alpha=0.9, label="Победа над игроком")
completed.query('type_of_completion == "project"')['usage_duration_hours'].hist(density=True, bins=30, alpha=0.5, color='pink', label="Завершение проекта")
plt.title('Histogram of the time of passing the level')
plt.legend(loc="upper right")
plt.xlabel('Number of hours')
plt.ylabel('Number of observations')
plt.show()
Both distributions are close to normal, which will allow us to use the T-criterion to compare the averages further. Let's look at the descriptive statistics for two samples.
completed.query('type_of_completion == "victory"')['usage_duration_hours'].describe()
count 3951.000000 mean 266.506688 std 98.298413 min 9.650278 25% 199.022083 50% 259.026944 75% 322.554861 max 741.981389 Name: usage_duration_hours, dtype: float64
completed.query('type_of_completion == "project"')['usage_duration_hours'].describe()
count 1866.000000 mean 323.013649 std 84.183883 min 123.374167 25% 262.832986 50% 312.761389 75% 373.789931 max 699.228611 Name: usage_duration_hours, dtype: float64
#visualize this on a graph
data_fig = completed
sns.set(rc={'figure.figsize':(10, 7)})
# box plot per completion strategy, with the raw points overlaid
ax = sns.boxplot(x="type_of_completion", y="usage_duration_hours", data=data_fig, orient='v')
# add stripplot
ax = sns.stripplot(x="type_of_completion", y="usage_duration_hours", data=data_fig, color="orange", jitter=0.2, size=2.5)
# Calculate number of obs per group & median to position labels
medians = data_fig.groupby(['type_of_completion'])['usage_duration_hours'].median().values
nobs = data_fig.groupby("type_of_completion").size().values
nobs = [str(x) for x in nobs.tolist()]
nobs = ["n: " + i for i in nobs]
# Add it to the plot
# place each "n: ..." annotation just above its group's median line
pos = range(len(nobs))
for tick,label in zip(pos,ax.get_xticklabels()):
plt.text(pos[tick], medians[tick] + 0.4, nobs[tick], horizontalalignment='center', size='medium', color='w', weight='semibold')
plt.xticks(rotation=45)
ax.set_ylim(-10,800)
plt.title('Time to complete the first level, depending on the method', fontsize=16)
plt.xlabel('Strategy', fontsize=14)
plt.ylabel('Transit time, hours', fontsize=14)
plt.show()
In general, there are many more people in the cohort who completed the level with a victory over another player, and on average it took them less time (both on average and median), but the variability of the time of passing through the victory over the player is much higher.
#we will save the IDs of users who have passed the first level in a separate collection
# a set gives O(1) membership tests for the per-event categorization below;
# the original list made each of the ~135k lookups O(n)
completed_user_id = set(completed['user_id'])
#function for categorizing the passage of the first level by the user
def define_completion(user_id):
    """Return "completed" if the user finished level 1, else "not_completed".

    Relies on the module-level `completed_user_id` collection.
    """
    # parameter renamed from `id` to avoid shadowing the builtin
    if user_id in completed_user_id:
        return "completed"
    return "not_completed"
#adding a feature
# vectorized isin() replaces the per-row apply() — identical labels, far faster
visits['level'] = np.where(
    visits['user_id'].isin(completed_user_id), "completed", "not_completed"
)
#record in a separate dataframe the events of all players who have not passed the first level
not_completed = visits.query('level == "not_completed"')
#let's add data about the time spent in the game, the first and last event to the dataframe
not_completed = pd.merge(not_completed, time, how="inner", on='user_id')
#adding a column about the duration of the passage in hours
# timedelta -> float hours, easier for stats and plotting
not_completed['usage_duration_hours'] = not_completed['usage_duration'].dt.total_seconds()/3600
not_completed['usage_duration_hours'].describe()
count 67170.000000 mean 265.602491 std 99.109697 min 0.000000 25% 199.270833 50% 259.324167 75% 324.335556 max 698.023333 Name: usage_duration_hours, dtype: float64
There are users who fall off immediately (the minimum time spent in the game is 0 hours).
# activity stats for players who did NOT finish level 1
# NOTE(review): this re-binds the cohort-wide `events`/`users` defined earlier —
# any later cell that reuses `users` will see the not-completed count, not 13576
events = len(not_completed['user_id'])
users = len(not_completed['user_id'].unique())
average_events_per_user = events / users
display('Total number of events', events)
display('Total number of users', users)
display('Average number of events per user', '{:.2f}'.format(average_events_per_user))
'Total number of events'
67170
'Total number of users'
7759
'Average number of events per user'
'8.66'
On average, users who drop out do so after about 9 events.
# event counts by building type among players who never finished level 1
temp = not_completed.groupby('building_type')['user_id'].count().reset_index()
fig = px.bar(temp, x='building_type', y='user_id')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Events by building types for players who have not moved to the 2nd level',
yaxis=dict(
title='Number of events',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
Building a research center is the most unpopular type of task. Let's look at the ratio of each type of building to the buildings of all users in the entire cohort.
# funnel: share of unique users who built each building type at least once
funnel = visits.groupby('building_type').agg({'user_id':'nunique'})\
.sort_values(by='user_id', ascending=False).reset_index(level=[0,0])
#percentage of the total number of unique users
funnel['percent'] = funnel['user_id'] / visits['user_id'].nunique()
with pd.option_context('display.float_format', '{:.2%}'.format):
display(funnel)
| building_type | user_id | percent | |
|---|---|---|---|
| 0 | assembly_shop | 13576 | 100.00% |
| 1 | spaceport | 13231 | 97.46% |
| 2 | research_center | 7671 | 56.50% |
It is difficult for players to build a research center: at this stage, most of the players fall off.
# share of users per acquisition channel
with pd.option_context('display.float_format', '{:.2%}'.format):
display(
sources['source'].value_counts(normalize=True)
)
yandex_direct 35.48% instagram_new_adverts 24.65% facebook_ads 20.08% youtube_channel_reklama 19.78% Name: source, dtype: float64
Most of the users came through Yandex Direct, least of all through advertising on YouTube. The two most popular channels brought about 60% of users.
# per-channel daily spend; seaborn's barplot aggregates cost per source (mean by default)
sns.set_theme(style="whitegrid")
sns.set(rc={'figure.figsize':(10,7)})
ax = sns.barplot(x="source", y="cost", data=costs)
plt.title('Advertising costs by channel', fontsize=16)
plt.xlabel('Source', fontsize=14)
plt.ylabel('Costs', fontsize=14)
plt.show()
The daily budget was about 300 USD on each of the three most popular platforms, while on YouTube it was about 150 USD.
# total spend per channel over the whole campaign
costs_grouped=costs.groupby(by=["source"])['cost'].sum().reset_index()
with pd.option_context('display.float_format', '{:.0f}'.format):
display(
costs_grouped.sort_values(by='cost',ascending=False)
)
| source | cost | |
|---|---|---|
| 2 | yandex_direct | 2233 |
| 1 | instagram_new_adverts | 2161 |
| 0 | facebook_ads | 2141 |
| 3 | youtube_channel_reklama | 1068 |
Let's look at the average cost of attracting through channels.
# total ad spend per day
costs_by_days = costs.groupby('day')['cost'].sum().reset_index()
fig = px.bar(costs_by_days, x='day', y='cost')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Advertising costs by day',
yaxis=dict(
title='The amount of advertising costs',
titlefont_size=15,
tickfont_size=10,
), width=700, height=400)
fig.show()
Advertising costs decreased gradually throughout the week.
# average CAC over the whole cohort: total spend / total players
print('{:.2f}'.format(costs['cost'].sum()/players_number))
0.56
# CAC per channel: total spend / number of users attracted through it
source_grouped=sources.groupby(by=["source"])['user_id'].count().reset_index()
source_grouped = pd.merge(costs_grouped, source_grouped, how="left", on='source')
source_grouped['CAC'] = source_grouped['cost'] / source_grouped['user_id']
source_grouped.sort_values("CAC", ascending=False)
| source | cost | user_id | CAC | |
|---|---|---|---|---|
| 0 | facebook_ads | 2140.904643 | 2726 | 0.785365 |
| 1 | instagram_new_adverts | 2161.441691 | 3347 | 0.645785 |
| 2 | yandex_direct | 2233.111449 | 4817 | 0.463590 |
| 3 | youtube_channel_reklama | 1068.119204 | 2686 | 0.397662 |
The most expensive channel by the average cost of attraction is facebook_ads, the least expensive is youtube_channel_reklama. The cost of attracting users via instagram_new_adverts and facebook_ads channels is higher than the average for the cohort, for yandex_direct and youtube_channel_reklama channels is lower.
time['first_ts_day'] = time['first_ts'].dt.date
temp = time.groupby('first_ts_day')['user_id'].nunique().reset_index()
temp['first_ts_day'] = temp['first_ts_day']+timedelta(days=1) #we will update the data for 1 day,
# because according to the comments of the team there is a time lag of 1 day between attracting and installing the game
# NOTE(review): this division pairs rows by positional index, not by date — it is
# only correct while both frames are sorted identically with one row per day;
# a merge on the (shifted) day would be safer
costs_by_days['CAC'] = costs_by_days['cost'] / temp['user_id']
costs_by_days.sort_values(by='day')
| day | cost | CAC | |
|---|---|---|---|
| 0 | 2020-05-03 | 3302.451840 | 0.563558 |
| 1 | 2020-05-04 | 1865.004649 | 0.554896 |
| 2 | 2020-05-05 | 1029.431469 | 0.557353 |
| 3 | 2020-05-06 | 620.476519 | 0.559492 |
| 4 | 2020-05-07 | 391.852033 | 0.569552 |
| 5 | 2020-05-08 | 242.767553 | 0.551744 |
| 6 | 2020-05-09 | 151.592924 | 0.559383 |
It can be seen that marketing tried to keep the cost of attracting a client approximately the same throughout the entire advertising campaign.
User behavior
42.85% of the players in the cohort completed the first level, while 29.10% did it through defeating another player.
Building a research center is the most unpopular type of task, at this stage the most players fall off.
The average number of events per user who has not passed the first level is 8.7. Completing the game via the project takes longer than via victory over another player; we will verify this below with a statistical test.
All players in the cohort chose an object to build - even those who completed the level through defeating another player. Therefore, with the current monetization model, there is not a single player who could not see the advertisement.
Marketing
We analyzed the distribution of channels by the number of attracted customers. Most of the customers came through yandex_direct (35% of users), 2 times more than through youtube_channel_reklama.
We calculated the average CAC for the whole cohort, as well as by channel and by day of the advertising campaign. The cost of attracting a user stayed roughly constant. The facebook_ads channel turned out to be the most expensive, while youtube_channel_reklama was the most cost-effective by this measure.
Let's evaluate the existing monetization model. It is supposed to show ads on the screen with a choice of buildings, for 1 such display we earn 0.07 CU. Let's calculate how much we would have earned in the cohort under study in a month by the number of events - completed buildings:
# revenue earned per single ad display, in conventional units
income_per_user = 0.07
# one ad display per completed building under the current model
total_income = income_per_user*len(visits.query('event=="building"'))
total_costs = costs['cost'].sum()
margin = (total_income-total_costs)/total_costs
print('Revenue from displaying ads under the current monetization model', total_income)
# divide by the whole cohort: the `users` variable was re-bound earlier to the
# count of players who did NOT finish level 1, which is the wrong denominator here
print('Average revenue by user', '{:.2f}'.format(total_income/players_number))
print('Total advertising costs', '{:.2f}'.format(total_costs))
print('Maximum ROMI under the current monetization model','{:.2%}'.format(margin))
Revenue from displaying ads under the current monetization model 8956.92 Average revenue by user 1.15 Total advertising costs 7603.58 Maximum ROMI under the current monetization model 17.80%
Considering that the cohort was not shown ads and not all users left as quickly as they could, our margin was 17.8%. However, some users could see the screen with the choice of building, but not complete the construction of the building, and the event did not get into the logs.
The average number of events per user in the logs is about 10 (9.99 pcs.). On average, attracting a user costs 0.56 yandex units, for 1 impression we earn 0.07 yandex units, therefore, the user must see the ad at least 8 times to pay off.
Let's see what margin is possible in the current cohort if we do not show an ad for the player's first building, but instead show ads on all other screens of the level. Such a model can work because, after the first construction, the player will have gotten a taste of the game and will want to keep building other objects.
# per-user counts of each event type (NaN where the user never had that event)
pivot = visits.pivot_table(values='level', columns='event', index=['user_id'], aggfunc = 'count').reset_index()
pivot
| event | user_id | building | finished_stage_1 | project |
|---|---|---|---|---|
| 0 | 0001f83c-c6ac-4621-b7f0-8a28b283ac30 | 13.0 | NaN | NaN |
| 1 | 00151b4f-ba38-44a8-a650-d7cf130a0105 | 9.0 | NaN | NaN |
| 2 | 001aaea6-3d14-43f1-8ca8-7f48820f17aa | 4.0 | NaN | NaN |
| 3 | 001d39dc-366c-4021-9604-6a3b9ff01e25 | 8.0 | 1.0 | NaN |
| 4 | 002f508f-67b6-479f-814b-b05f00d4e995 | 12.0 | NaN | NaN |
| ... | ... | ... | ... | ... |
| 13571 | ffef4fed-164c-40e1-bde1-3980f76d0fb5 | 9.0 | 1.0 | NaN |
| 13572 | fffab3da-da0e-4e30-ae62-10d0a2e24a4e | 16.0 | NaN | NaN |
| 13573 | fffb626c-5ab6-47c9-8113-2062a2f18494 | 9.0 | 1.0 | NaN |
| 13574 | ffff194a-56b7-4c12-860d-3485242ae7f5 | 7.0 | NaN | NaN |
| 13575 | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | 12.0 | 1.0 | 1.0 |
13576 rows × 4 columns
def reduce_build(building_number):
    """Return the event count with the first 'building' completion excluded.

    The first construction will not be monetized under the proposed model,
    so one event is subtracted from any positive count. Zero and NaN counts
    pass through unchanged (NaN > 0 is False).
    """
    if building_number > 0:
        return building_number - 1
    return building_number
# Number of 'building' events eligible for ad display once the first
# (free) construction is excluded.
pivot['building_for_ads'] = pivot['building'].map(reduce_build)
pivot
| event | user_id | building | finished_stage_1 | project | building_for_ads |
|---|---|---|---|---|---|
| 0 | 0001f83c-c6ac-4621-b7f0-8a28b283ac30 | 13.0 | NaN | NaN | 12.0 |
| 1 | 00151b4f-ba38-44a8-a650-d7cf130a0105 | 9.0 | NaN | NaN | 8.0 |
| 2 | 001aaea6-3d14-43f1-8ca8-7f48820f17aa | 4.0 | NaN | NaN | 3.0 |
| 3 | 001d39dc-366c-4021-9604-6a3b9ff01e25 | 8.0 | 1.0 | NaN | 7.0 |
| 4 | 002f508f-67b6-479f-814b-b05f00d4e995 | 12.0 | NaN | NaN | 11.0 |
| ... | ... | ... | ... | ... | ... |
| 13571 | ffef4fed-164c-40e1-bde1-3980f76d0fb5 | 9.0 | 1.0 | NaN | 8.0 |
| 13572 | fffab3da-da0e-4e30-ae62-10d0a2e24a4e | 16.0 | NaN | NaN | 15.0 |
| 13573 | fffb626c-5ab6-47c9-8113-2062a2f18494 | 9.0 | 1.0 | NaN | 8.0 |
| 13574 | ffff194a-56b7-4c12-860d-3485242ae7f5 | 7.0 | NaN | NaN | 6.0 |
| 13575 | ffff69cc-fec1-4fd3-9f98-93be1112a6b8 | 12.0 | 1.0 | 1.0 | 11.0 |
13576 rows × 5 columns
# ROMI if the first building is ad-free but every other event screen shows ads:
# revenue = price per impression * total ad-eligible events across all users.
total_income = income_per_user * (
    pivot[['building_for_ads', 'finished_stage_1', 'project']].sum().sum()
)
margin = (total_income - total_costs) / total_costs
print('Revenue from displaying ads under the proposed monetization model', '{:.2f}'.format(total_income))
print('Average revenue by user', '{:.2f}'.format(total_income / users))
print('Total advertising costs', '{:.2f}'.format(total_costs))
print('Maximum ROMI under the proposed monetization model', '{:.2%}'.format(margin))
Revenue from displaying ads under the proposed monetization model 8544.41 Average revenue by user 1.10 Total advertising costs 7603.58 Maximum ROMI under the proposed monetization model 12.37%
The margin turns out to be less than with a conservative assessment of the current monetization model. This option can be a working one if it is possible to significantly reduce the CAC by optimizing the marketing strategy. Let's assume that we will still show ads on the screen with the completion of the construction of the 1st object:
# Same model, but ads are also shown on the first-building completion screen:
# count every 'building' event rather than the reduced column.
total_income = income_per_user * (
    pivot[['building', 'finished_stage_1', 'project']].sum().sum()
)
margin = (total_income - total_costs) / total_costs
print('Revenue from displaying ads under the proposed monetization model', '{:.2f}'.format(total_income))
print('Average revenue by user', '{:.2f}'.format(total_income / users))
print('Total advertising costs', '{:.2f}'.format(total_costs))
print('Maximum ROMI under the proposed monetization model', '{:.2%}'.format(margin))
Revenue from displaying ads under the proposed monetization model 9494.73 Average revenue by user 1.22 Total advertising costs 7603.58 Maximum ROMI under the proposed monetization model 24.87%
We can use this model.
This is a hypothesis about the equality of two averages, let's use the t-test.
H0 - the average passage time between users who complete a level through the implementation of the project and users who complete a level by defeating another player does not differ.
H1 - the average passage time is different.
# Compare level-completion time between the two winning strategies.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t);
# consider Welch's test (equal_var=False) if group variances differ.
is_victory = completed['type_of_completion'] == 'victory'
is_project = completed['type_of_completion'] == 'project'
sample_1 = completed.loc[is_victory, 'usage_duration_hours']
sample_2 = completed.loc[is_project, 'usage_duration_hours']
alpha = 0.05
results = st.ttest_ind(sample_1, sample_2)
print('p-value:', results.pvalue)
if results.pvalue < alpha:
    print("Reject the null hypothesis")
else:
    print("Cannot reject the null hypothesis")
p-value: 7.256845329495443e-98 Reject the null hypothesis
There is a statistically significant difference between the two populations.
Perhaps before the weekend and on weekends, people have less stress and distractions, and they are ready to spend more time in the game, which means they will pass the first level in fewer days.
This is a hypothesis about the equality of two averages, let's use the t-test.
H0 - the average passage time does not differ between users who logged in to the app for the first time on a weekend and users who logged in for the first time on weekdays.
H1 - the average passage time is different.
# Day of week of each user's first session (e.g. 'Monday').
completed['first_ts_dayweek'] = pd.to_datetime(completed['first_ts']).dt.day_name()
# Split completed users by when they first logged in: Friday is grouped with
# the weekend, per the hypothesis about pre-weekend free time.
workdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday']
weekends = ['Friday', 'Saturday', 'Sunday']
sample_1 = completed.loc[completed['first_ts_dayweek'].isin(workdays), 'usage_duration_hours']
sample_2 = completed.loc[completed['first_ts_dayweek'].isin(weekends), 'usage_duration_hours']
alpha = 0.05
results = st.ttest_ind(sample_1, sample_2)
print('p-value:', results.pvalue)
if results.pvalue < alpha:
    print("Reject the null hypothesis")
else:
    print("Cannot reject the null hypothesis")
p-value: 2.64064471749543e-05 Reject the null hypothesis
There is a statistically significant difference between the two populations.
# Mean completion time (hours) for the weekday cohort.
print(f"Average time to complete the game for players who came in during the working week {sample_1.mean()}")
Average time to complete the game for players who came in during the working week 286.1917059572618
# Mean completion time (hours) for the weekend cohort.
print(f"Average passing time of the game for players who came at the end of the week {sample_2.mean()}")
Average passing time of the game for players who came at the end of the week 266.090238975117
The difference is almost a day.
For marketing, it is recommended to reconsider the strategy on the facebook_ads channel: the cost of attracting on this channel is 40% higher than the average for the cohort. The most effective channel in terms of the cost of attracting users is youtube_channel_reklama, although it had the smallest budget. The marketing department should continue to experiment with this channel.
Hypotheses were tested about the difference in the average time of passing the level between users with different strategies and users who came on different days of the week. In both cases, the difference was statistically significant.
As a monetization model, it is recommended to test the display of advertising at all stages of the game, except for choosing the first building as more enticing to the player.